Modelo Poisson de Mistura

Gerando os jogos

num_teams <- 20

# Dicionario para relacionar o id de um time com seu nome
team_names <- c("Dragões do Sertão", "Atlético Rio Vermelho", "Borborema",
                "Guerreiros da Mata", "Cacique", "Aurora Litorânea",
                "Gávea Azul", "Mandacaru United", "Capibaribe", 
                "Índios Tupiniquins", "Atlético Taquara Verde", "Seriema",
                "Blumenau City", "Iguaçu", "Atlético Palmares",
                "Serra Dourada", "Sambaqui", "Pampa", 
                "Riacho do Meio", "Sport Club Xingu")


games <- data.frame(
  h = rep(1:num_teams, each = num_teams),
  a = rep(1:num_teams, times = num_teams)
)

games <- games[games$h != games$a, ]

Definindo os parâmetros dos dados gerados:

set.seed(28)

beta_0 <- -0.1
home_effect <- 0.35
sd_att <- c(0.2, 0.01, 0.2)
sd_def <- c(0.2, 0.01, 0.2)

pi_att <- c(0.45, 0.1, 0.45)
pi_def <- c(0.4, 0.2, 0.4)

mu_att <- c(-0.4, 0, 0.4)
mu_def <- c(0.5, 0, -0.5)

att_category <- sample(1:3, 20, replace = T, prob = pi_att)
def_category <- sample(1:3, 20, replace = T, prob = pi_def)

att_effects <- rnorm(num_teams, mu_att[att_category], sd_att)
def_effects <- rnorm(num_teams, mu_def[def_category], sd_def)

Esses foram os efeitos de ataque e defesa gerados:

Gerando os resultados dos jogos

set.seed(40)

simulate_games <- function(games){
  num_games <- length(games$h)
  home_team <- games$h
  away_team <- games$a
  
  theta_1 <- beta_0 + home_effect + att_effects[home_team] + def_effects[away_team]
  theta_2 <- beta_0 + att_effects[away_team] + def_effects[home_team]
  
  y1 <- rpois(num_games, exp(theta_1))
  y2 <- rpois(num_games, exp(theta_2))
  
  games$y1 <- y1
  games$y2 <- y2  
  
  return(games)
}

games <- simulate_games(games)
num_games <- nrow(games)

Estimando os parâmetros com o STAN

data <- c(G = num_games, T = num_teams, as.list(games))

model <- stan_model("./model/poisson_mistura_simple.stan")

iter <- 10000
fit <- sampling(model, data = data, iter = iter, chains = 2, cores = 2)
Warning: There were 9316 divergent transitions after warmup. See
https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
to find out why this is a problem and how to eliminate them.
Warning: Examine the pairs() plot to diagnose sampling problems
Warning: The largest R-hat is 2.13, indicating chains have not mixed.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#r-hat
Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#bulk-ess
Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#tail-ess
data {
  int<lower=1> G; // Numero total de jogos
  int<lower=1> T; // Numero de total times
  int<lower=0, upper=T> h[G]; // Indice do time que joga em casa no G-esimo jogo
  int<lower=0, upper=T> a[G]; // Indice do time que joga como visitante no G-esimo jogo
  int<lower=0> y1[G]; // Numero de gols marcados no G-esimo jogo pelo time que joga em casa
  int<lower=0> y2[G]; // Numero de gols marcados no G-esimo jogo pelo time que joga como visitante
}

parameters {
  real home;
  real mu;
  real<upper=-0.25> mu_att1;
  real<lower=0.25> mu_att3;
  real<lower=0.25> mu_def1;
  real<upper=-0.25> mu_def3;
  real<lower=0> sigma_att[3];
  real<lower=0> sigma_def[3];
  vector[T] att;
  vector[T] def;
  simplex[3] pi_att;
  simplex[3] pi_def;
}

model {
  real m_att[3];
  real m_def[3];
  real aux;

  mu ~ normal(0, 10);
  home ~ normal(0, 10);
  mu_att1 ~ normal(0, 10);
  mu_att3 ~ normal(0, 10);
  mu_def1 ~ normal(0, 10);
  mu_def3 ~ normal(0, 10);
  
  sigma_att ~ cauchy(0, 0.05);
  sigma_def ~ cauchy(0, 0.05);
  
  pi_att ~ dirichlet(rep_vector(0.001, 3));
  pi_def ~ dirichlet(rep_vector(0.001, 3));
  
  for (t in 1:T) {
    m_att[1] = log(pi_att[1]) + normal_lpdf(att[t] |  mu_att1, sigma_att[1]) - 
                                normal_lcdf(-0.2 | mu_att1, sigma_att[1]);
                                
    m_att[2] = log(pi_att[2]) + normal_lpdf(att[t] |  0, 0.01) - 
               log_diff_exp(normal_lcdf(0.2 | 0, 0.01), 
                           normal_lcdf(-0.2 | 0, 0.01));

    m_att[3] = log(pi_att[3]) + normal_lpdf(att[t] |  mu_att3, sigma_att[3]) - 
                                normal_lccdf(0.2 | mu_att3, sigma_att[3]);
    
    m_def[1] = log(pi_def[1]) + normal_lpdf(def[t] |  mu_def1, sigma_def[1]) -
                                normal_lccdf(0.2 | mu_def1, sigma_def[1]);
                                
    m_def[2] = log(pi_def[2]) + normal_lpdf(def[t] |  0, 0.01)  - 
               log_diff_exp(normal_lcdf(0.2 | 0, 0.01), 
                           normal_lcdf(-0.2 | 0, 0.01));
                           
    m_def[3] = log(pi_def[3]) + normal_lpdf(def[t] |  mu_def3, sigma_def[3]) - 
                                normal_lcdf(-0.2 | mu_def3, sigma_def[3]);
    
    target += log_sum_exp(m_att) + log_sum_exp(m_def);
  }
  
  for (g in 1:G) {
    y1[g] ~ poisson_log(mu + home + att[h[g]] + def[a[g]]);
    y2[g] ~ poisson_log(mu + att[a[g]] + def[h[g]]);
  }
  
}

generated quantities {
 vector[G] y1_tilde;
 vector[G] y2_tilde;
 vector[G] log_lik;
 for (g in 1:G) {
   y1_tilde[g] = poisson_log_rng(mu + home + att[h[g]] + def[a[g]]);
   y2_tilde[g] = poisson_log_rng(mu + att[a[g]] + def[h[g]]);
   log_lik[g] = poisson_log_lpmf(y1[g] | mu + home + att[h[g]] + def[a[g]]) + poisson_log_lpmf(y2[g] | mu + att[a[g]] + def[h[g]]);
 } 
}

Analisando possíveis problemas

Traceplot

Visualizando os valores estimados para 100 campeonatos